date()
## [1] "Mon Nov 13 17:24:47 2023"
# set directory
setwd("~/Desktop/Open data with R 2023/IODS-project")
Thoughts about this week 2:
After reading all the chapters 1-7, I am now more confident using RStudio. I also understand the language better, and I can search the web to learn how to use new functions that I did not know.
It is very exciting to see how efficient this tool is and to think about all the analyses we can do. I am an open university student and I can already see how to use this tool at work :).
Exercise: WRANGLING Please find the script file in the Github: create_learning2014_week2
Exercise: DATA ANALYSIS
# Using the table from the course: read the comma-separated learning2014 file
# (first row holds the column names) straight from the course repository.
# `TRUE` is spelled out because `T` is an ordinary variable that can be reassigned.
csv_table_read <- read.table(
  "https://raw.githubusercontent.com/KimmoVehkalahti/Helsinki-Open-Data-Science/master/datasets/learning2014.txt",
  sep = ",",
  header = TRUE
)
# library
library("tidyverse")
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library("finalfit")
library("broom")
csv_table_read
## gender age attitude deep stra surf points
## 1 F 53 3.7 3.583333 3.375 2.583333 25
## 2 M 55 3.1 2.916667 2.750 3.166667 12
## 3 F 49 2.5 3.500000 3.625 2.250000 24
## 4 M 53 3.5 3.500000 3.125 2.250000 10
## 5 M 49 3.7 3.666667 3.625 2.833333 22
## 6 F 38 3.8 4.750000 3.625 2.416667 21
## 7 M 50 3.5 3.833333 2.250 1.916667 21
## 8 F 37 2.9 3.250000 4.000 2.833333 31
## 9 M 37 3.8 4.333333 4.250 2.166667 24
## 10 F 42 2.1 4.000000 3.500 3.000000 26
## 11 M 37 3.9 3.583333 3.625 2.666667 31
## 12 F 34 3.8 3.833333 4.750 2.416667 31
## 13 F 34 2.4 4.250000 3.625 2.250000 23
## 14 F 34 3.0 3.333333 3.500 2.750000 25
## 15 M 35 2.6 4.166667 1.750 2.333333 21
## 16 F 33 4.1 3.666667 3.875 2.333333 31
## 17 F 32 2.6 4.083333 1.375 2.916667 20
## 18 F 44 2.6 3.500000 3.250 2.500000 22
## 19 M 29 1.7 4.083333 3.000 3.750000 9
## 20 F 30 2.7 4.000000 3.750 2.750000 24
## 21 M 27 3.9 3.916667 2.625 2.333333 28
## 22 M 29 3.4 4.000000 2.375 2.416667 30
## 23 F 31 2.7 4.000000 3.625 3.000000 24
## 24 F 37 2.3 3.666667 2.750 2.416667 9
## 25 F 26 3.7 3.666667 1.750 2.833333 26
## 26 F 26 4.4 4.416667 3.250 3.166667 32
## 27 M 30 4.1 3.916667 4.000 3.000000 32
## 28 F 33 3.7 3.750000 3.625 2.000000 33
## 29 F 33 2.5 3.250000 2.875 3.500000 29
## 30 M 28 3.0 3.583333 3.000 3.750000 30
## 31 M 26 3.4 4.916667 1.625 2.500000 19
## 32 F 27 3.2 3.583333 3.250 2.083333 23
## 33 F 25 2.0 2.916667 3.500 2.416667 19
## 34 F 31 2.4 3.666667 3.000 2.583333 12
## 35 M 20 4.2 4.500000 3.250 1.583333 10
## 36 F 39 1.6 4.083333 1.875 2.833333 11
## 37 M 38 3.1 3.833333 4.375 1.833333 20
## 38 M 24 3.8 3.250000 3.625 2.416667 26
## 39 M 26 3.8 2.333333 2.500 3.250000 31
## 40 M 25 3.3 3.333333 1.250 3.416667 20
## 41 F 30 1.7 4.083333 4.000 3.416667 23
## 42 F 25 2.5 2.916667 3.000 3.166667 12
## 43 M 30 3.2 3.333333 2.500 3.500000 24
## 44 F 48 3.5 3.833333 4.875 2.666667 17
## 45 F 24 3.2 3.666667 5.000 2.416667 29
## 46 F 40 4.2 4.666667 4.375 3.583333 23
## 47 M 25 3.1 3.750000 3.250 2.083333 28
## 48 F 23 3.9 3.416667 4.000 3.750000 31
## 49 F 25 1.9 4.166667 3.125 2.916667 23
## 50 F 23 2.1 2.916667 2.500 2.916667 25
## 51 M 27 2.5 4.166667 3.125 2.416667 18
## 52 M 25 3.2 3.583333 3.250 3.000000 19
## 53 M 23 3.2 2.833333 2.125 3.416667 22
## 54 F 23 2.6 4.000000 2.750 2.916667 25
## 55 F 23 2.3 2.916667 2.375 3.250000 21
## 56 F 45 3.8 3.000000 3.125 3.250000 9
## 57 F 22 2.8 4.083333 4.000 2.333333 28
## 58 F 23 3.3 2.916667 4.000 3.250000 25
## 59 M 21 4.8 3.500000 2.250 2.500000 29
## 60 M 21 4.0 4.333333 3.250 1.750000 33
## 61 F 21 4.0 4.250000 3.625 2.250000 33
## 62 F 21 4.7 3.416667 3.625 2.083333 25
## 63 F 26 2.3 3.083333 2.500 2.833333 18
## 64 F 25 3.1 4.583333 1.875 2.833333 22
## 65 F 26 2.7 3.416667 2.000 2.416667 17
## 66 M 21 4.1 3.416667 1.875 2.250000 25
## 67 F 23 3.4 3.416667 4.000 2.833333 28
## 68 F 22 2.5 3.583333 2.875 2.250000 22
## 69 F 22 2.1 1.583333 3.875 1.833333 26
## 70 F 22 1.4 3.333333 2.500 2.916667 11
## 71 F 23 1.9 4.333333 2.750 2.916667 29
## 72 M 22 3.7 4.416667 4.500 2.083333 22
## 73 M 23 3.2 4.833333 3.375 2.333333 21
## 74 M 24 2.8 3.083333 2.625 2.416667 28
## 75 F 22 4.1 3.000000 4.125 2.750000 33
## 76 F 23 2.5 4.083333 2.625 3.250000 16
## 77 M 22 2.8 4.083333 2.250 1.750000 31
## 78 M 20 3.8 3.750000 2.750 2.583333 22
## 79 M 22 3.1 3.083333 3.000 3.333333 31
## 80 M 21 3.5 4.750000 1.625 2.833333 23
## 81 F 22 3.6 4.250000 1.875 2.500000 26
## 82 F 23 2.6 4.166667 3.375 2.416667 12
## 83 M 21 4.4 4.416667 3.750 2.416667 26
## 84 M 22 4.5 3.833333 2.125 2.583333 31
## 85 M 29 3.2 3.333333 2.375 3.000000 19
## 86 F 29 3.9 3.166667 2.750 2.000000 30
## 87 F 21 2.5 3.166667 3.125 3.416667 12
## 88 M 28 3.3 3.833333 3.500 2.833333 17
## 89 F 21 3.3 4.250000 2.625 2.250000 18
## 90 F 30 3.0 3.833333 3.375 2.750000 19
## 91 F 21 2.9 3.666667 2.250 3.916667 21
## 92 M 23 3.3 3.833333 3.000 2.333333 24
## 93 F 21 3.3 3.833333 4.000 2.750000 28
## 94 F 21 3.5 3.833333 3.500 2.750000 17
## 95 F 20 3.6 3.666667 2.625 2.916667 18
## 96 M 22 3.7 4.333333 2.500 2.083333 17
## 97 M 21 4.2 3.750000 3.750 3.666667 23
## 98 M 21 3.2 4.166667 3.625 2.833333 26
## 99 F 20 5.0 4.000000 4.125 3.416667 28
## 100 M 22 4.7 4.000000 4.375 1.583333 31
## 101 F 20 3.6 4.583333 2.625 2.916667 27
## 102 F 20 3.6 3.666667 4.000 3.000000 25
## 103 M 24 2.9 3.666667 2.750 2.916667 23
## 104 F 20 3.5 3.833333 2.750 2.666667 21
## 105 F 19 4.0 2.583333 1.375 3.000000 27
## 106 F 21 3.5 3.500000 2.250 2.750000 28
## 107 F 21 3.2 3.083333 3.625 3.083333 23
## 108 F 22 2.6 4.250000 3.750 2.500000 21
## 109 F 25 2.0 3.166667 4.000 2.333333 25
## 110 F 21 2.7 3.083333 3.125 3.000000 11
## 111 F 22 3.2 4.166667 3.250 3.000000 19
## 112 F 25 3.3 2.250000 2.125 4.000000 24
## 113 F 20 3.9 3.333333 2.875 3.250000 28
## 114 M 24 3.3 3.083333 1.500 3.500000 21
## 115 F 20 3.0 2.750000 2.500 3.500000 24
## 116 M 21 3.7 3.250000 3.250 3.833333 24
## 117 F 20 2.5 4.000000 3.625 2.916667 20
## 118 F 20 2.9 3.583333 3.875 2.166667 19
## 119 M 31 3.9 4.083333 3.875 1.666667 30
## 120 F 20 3.6 4.250000 2.375 2.083333 22
## 121 F 22 2.9 3.416667 3.000 2.833333 16
## 122 F 22 2.1 3.083333 3.375 3.416667 16
## 123 M 21 3.1 3.500000 2.750 3.333333 19
## 124 M 22 4.0 3.666667 4.500 2.583333 30
## 125 F 21 3.1 4.250000 2.625 2.833333 23
## 126 F 21 2.3 4.250000 2.750 3.333333 19
## 127 F 21 2.8 3.833333 3.250 3.000000 18
## 128 F 21 3.7 4.416667 4.125 2.583333 28
## 129 F 20 2.6 3.500000 3.375 2.416667 21
## 130 F 21 2.4 3.583333 2.750 3.583333 19
## 131 F 25 3.0 3.666667 4.125 2.083333 27
## 132 M 21 2.8 2.083333 3.250 4.333333 24
## 133 F 24 2.9 4.250000 2.875 2.666667 21
## 134 F 20 2.4 3.583333 2.875 3.000000 20
## 135 M 21 3.1 4.000000 2.375 2.666667 28
## 136 F 20 1.9 3.333333 3.875 2.166667 12
## 137 F 20 2.0 3.500000 2.125 2.666667 21
## 138 F 18 3.8 3.166667 4.000 2.250000 28
## 139 F 21 3.4 3.583333 3.250 2.666667 31
## 140 F 19 3.7 3.416667 2.625 3.333333 18
## 141 F 21 2.9 4.250000 2.750 3.500000 25
## 142 F 20 2.3 3.250000 4.000 2.750000 19
## 143 M 21 4.1 4.416667 3.000 2.000000 21
## 144 F 20 2.7 3.250000 3.375 2.833333 16
## 145 F 21 3.5 3.916667 3.875 3.500000 7
## 146 F 20 3.4 3.583333 3.250 2.500000 21
## 147 F 18 3.2 4.500000 3.375 3.166667 17
## 148 M 22 3.3 3.583333 4.125 3.083333 22
## 149 F 22 3.3 3.666667 3.500 2.916667 18
## 150 M 24 3.5 2.583333 2.000 3.166667 25
## 151 F 19 3.2 4.166667 3.625 2.500000 24
## 152 F 20 3.1 3.250000 3.375 3.833333 23
## 153 F 20 2.8 4.333333 2.125 2.250000 23
## 154 F 17 1.7 3.916667 4.625 3.416667 26
## 155 M 19 1.9 2.666667 2.500 3.750000 12
## 156 F 20 3.5 3.083333 2.875 3.000000 32
## 157 F 20 2.4 3.750000 2.750 2.583333 22
## 158 F 20 2.1 4.166667 4.000 3.333333 20
## 159 F 20 2.9 4.166667 2.375 2.833333 21
## 160 F 19 1.9 3.250000 3.875 3.000000 23
## 161 F 19 2.0 4.083333 3.375 2.833333 20
## 162 F 22 4.2 2.916667 1.750 3.166667 28
## 163 M 35 4.1 3.833333 3.000 2.750000 31
## 164 F 18 3.7 3.166667 2.625 3.416667 18
## 165 F 19 3.6 3.416667 2.625 3.000000 30
## 166 M 21 1.8 4.083333 3.375 2.666667 19
# analyze the structure of the dataset
str(csv_table_read)
## 'data.frame': 166 obs. of 7 variables:
## $ gender : chr "F" "M" "F" "M" ...
## $ age : int 53 55 49 53 49 38 50 37 37 42 ...
## $ attitude: num 3.7 3.1 2.5 3.5 3.7 3.8 3.5 2.9 3.8 2.1 ...
## $ deep : num 3.58 2.92 3.5 3.5 3.67 ...
## $ stra : num 3.38 2.75 3.62 3.12 3.62 ...
## $ surf : num 2.58 3.17 2.25 2.25 2.83 ...
## $ points : int 25 12 24 10 22 21 21 31 24 26 ...
# analyze the dimension of the dataset
dim(csv_table_read)
## [1] 166 7
# Summary statistics and missing-data counts per variable. No data is missing (missing_n is 0 everywhere).
ff_glimpse(csv_table_read)
## $Continuous
## label var_type n missing_n missing_percent mean sd min
## age age <int> 166 0 0.0 25.5 7.8 17.0
## attitude attitude <dbl> 166 0 0.0 3.1 0.7 1.4
## deep deep <dbl> 166 0 0.0 3.7 0.6 1.6
## stra stra <dbl> 166 0 0.0 3.1 0.8 1.2
## surf surf <dbl> 166 0 0.0 2.8 0.5 1.6
## points points <int> 166 0 0.0 22.7 5.9 7.0
## quartile_25 median quartile_75 max
## age 21.0 22.0 27.0 55.0
## attitude 2.6 3.2 3.7 5.0
## deep 3.3 3.7 4.1 4.9
## stra 2.6 3.2 3.6 5.0
## surf 2.4 2.8 3.2 4.3
## points 19.0 23.0 27.8 33.0
##
## $Categorical
## label var_type n missing_n missing_percent levels_n levels
## gender gender <chr> 166 0 0.0 2 -
## levels_count levels_percent
## gender - -
# missing values per variable: no data is missing
missing_glimpse(csv_table_read)
## label var_type n missing_n missing_percent
## gender gender <chr> 166 0 0.0
## age age <int> 166 0 0.0
## attitude attitude <dbl> 166 0 0.0
## deep deep <dbl> 166 0 0.0
## stra stra <dbl> 166 0 0.0
## surf surf <dbl> 166 0 0.0
## points points <int> 166 0 0.0
# Count per gender and percentage male / female
library("scales")
##
## Attaching package: 'scales'
##
## The following object is masked from 'package:purrr':
##
## discard
##
## The following object is masked from 'package:readr':
##
## col_factor
# Tally the students by gender, then express each tally as a share of all rows:
# once as a raw proportion and once formatted as a percentage label.
csv_table_read %>%
  count(gender) %>%
  mutate(
    total_percentage = n / nrow(csv_table_read),
    total_percentage2 = percent(total_percentage)
  )
## gender n total_percentage total_percentage2
## 1 F 110 0.6626506 66%
## 2 M 56 0.3373494 34%
# Min, quartiles, median, mean and max of every variable for the whole dataset (not split by gender)
summary(csv_table_read)
## gender age attitude deep
## Length:166 Min. :17.00 Min. :1.400 Min. :1.583
## Class :character 1st Qu.:21.00 1st Qu.:2.600 1st Qu.:3.333
## Mode :character Median :22.00 Median :3.200 Median :3.667
## Mean :25.51 Mean :3.143 Mean :3.680
## 3rd Qu.:27.00 3rd Qu.:3.700 3rd Qu.:4.083
## Max. :55.00 Max. :5.000 Max. :4.917
## stra surf points
## Min. :1.250 Min. :1.583 Min. : 7.00
## 1st Qu.:2.625 1st Qu.:2.417 1st Qu.:19.00
## Median :3.188 Median :2.833 Median :23.00
## Mean :3.121 Mean :2.787 Mean :22.72
## 3rd Qu.:3.625 3rd Qu.:3.167 3rd Qu.:27.75
## Max. :5.000 Max. :4.333 Max. :33.00
# The age varies from 17 to 55; the mean (25.5) is higher than the median (22), which suggests that there are some relatively high ages in the dataset
# The attitude varies from 1.4 to 5
# The points range from 7 to 33; the mean (22.7) is slightly below the median (23), which suggests that there are some relatively low values in the dataset
# we analyze the variables for both genders together, and then for females and males separately
# draw a scatter plot matrix of the variables in learning2014.
# [-1] excludes the first column (gender)
pairs(csv_table_read[-1])
# access the GGally and ggplot2 libraries
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggplot2)
# create a more advanced plot matrix with ggpairs()
p <- ggpairs(csv_table_read, mapping = aes(), lower = list(combo = wrap("facethist", bins = 20)))
p
# some data shows that there could be correlation between some variables
# Relationship between points and attitudes
csv_table_read %>%
ggplot(aes(x= attitude, y= points)) +
geom_point() +
facet_wrap(~ gender) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# Female model: keep only the rows of the female students
female_data <- csv_table_read %>%
  filter(gender == "F")
# View() opens a GUI data viewer, so it is only useful in an interactive
# session; the guard keeps non-interactive runs (knitting, Rscript) from
# trying to open a viewer window.
if (interactive()) {
  View(female_data)
}
# Fit a multiple linear model for females. Let's check how points are influenced by age, attitude and deep learning approach
female_fitmodel <- lm(points ~ age + attitude + deep, data = female_data)
# In this model I want to check how age, attitude and deep each relate to points when the other two are held constant.
# summary of the estimates, standard errors, p-values and R-squared
summary(female_fitmodel)
##
## Call:
## lm(formula = points ~ age + attitude + deep, data = female_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.058 -3.263 0.622 4.003 10.533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.59029 4.28982 3.168 0.00201 **
## age -0.01743 0.06983 -0.250 0.80338
## attitude 3.40151 0.70837 4.802 5.19e-06 ***
## deep -0.27355 0.96270 -0.284 0.77685
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.353 on 106 degrees of freedom
## Multiple R-squared: 0.1791, Adjusted R-squared: 0.1559
## F-statistic: 7.71 on 3 and 106 DF, p-value: 0.0001043
summary(female_fitmodel)$r.squared
## [1] 0.1791183
# "age", "attitude" and "deep" together explain about 18% of the variation in "points"
# the intercept is significant, as its p-value is very small (0.002)
# the baseline of the model is 13.59 (estimate): the predicted points when age, attitude and deep are all zero
# age is not significant (p = 0.80): there is no evidence of a linear association with points in this model
# deep is not significant (p = 0.78): there is no evidence of a linear association with points in this model
# attitude is significant (p < 0.001) and seems to play an important role in the points.
# for a one-point increase in attitude, the points increase by 3.40 (estimate), holding age and deep constant
# A high standard error shows that an estimate is not very precise. It could be due to the sample size.
# I decide to drop the deep and the age variables and keep only the attitude.
female_fitmodel2 <- lm(points ~ attitude, data = female_data)
summary(female_fitmodel2)
##
## Call:
## lm(formula = points ~ attitude, data = female_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.0557 -3.3486 0.6137 3.9819 10.3668
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 12.194 2.156 5.655 1.29e-07 ***
## attitude 3.389 0.701 4.835 4.44e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.307 on 108 degrees of freedom
## Multiple R-squared: 0.1779, Adjusted R-squared: 0.1703
## F-statistic: 23.38 on 1 and 108 DF, p-value: 4.442e-06
tidy(female_fitmodel2)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 12.2 2.16 5.66 0.000000129
## 2 attitude 3.39 0.701 4.83 0.00000444
summary(female_fitmodel2)$r.squared
## [1] 0.1779266
# the p-value for attitude is very low and its standard error is small relative to the estimate, so the model supports a positive relationship: a more positive attitude -> more points.
# R-squared is still quite low (about 18%).
# The model explains only a small part of the variance in points; a large portion remains unexplained. It could be due to the sample size.
# autoplot: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage
# Identify issues with my regression model, such as non-linearity, non-normality, or influential data points
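# autoplot() does not knit here, presumably because ggfortify (which provides autoplot() for lm objects) is only loaded later in this script.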
#autoplot(female_fitmodel)
#autoplot(female_fitmodel2)
# I use plot function and which to get the desired plots.
plot(female_fitmodel,which = c(1,2,5))
plot(female_fitmodel2,which = c(1,2,5))
# we observe non normality at the end and beginning of the line in qq plot
# both models show that there are some points that are high leverage indicated on the residuals vs leverage
# Male model: keep only the rows of the male students
male_data <- csv_table_read %>%
  filter(gender == "M")
# View() opens a GUI data viewer, so it is only useful in an interactive
# session; the guard keeps non-interactive runs (knitting, Rscript) from
# trying to open a viewer window.
if (interactive()) {
  View(male_data)
}
summary(male_data)
## gender age attitude deep
## Length:56 Min. :19.0 Min. :1.700 Min. :2.083
## Class :character 1st Qu.:21.0 1st Qu.:3.100 1st Qu.:3.396
## Mode :character Median :24.0 Median :3.400 Median :3.792
## Mean :26.8 Mean :3.443 Mean :3.725
## 3rd Qu.:29.0 3rd Qu.:3.900 3rd Qu.:4.083
## Max. :55.0 Max. :4.800 Max. :4.917
## stra surf points
## Min. :1.250 Min. :1.583 Min. : 9.00
## 1st Qu.:2.375 1st Qu.:2.312 1st Qu.:20.00
## Median :3.000 Median :2.625 Median :23.50
## Mean :2.964 Mean :2.704 Mean :23.48
## 3rd Qu.:3.531 3rd Qu.:3.167 3rd Qu.:28.25
## Max. :4.500 Max. :4.333 Max. :33.00
# Fit a multiple linear model for males. Let's check how points are influenced by age, attitude and deep learning approach
male_fitmodel <- lm(points ~ age + attitude + deep, data = male_data)
# summary of the estimates, standard errors, p-values and R-squared
summary(male_fitmodel)
##
## Call:
## lm(formula = points ~ age + attitude + deep, data = male_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.8084 -3.3162 -0.0696 3.2195 9.9927
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 18.21897 6.06857 3.002 0.004112 **
## age -0.16602 0.08456 -1.963 0.054974 .
## attitude 4.31829 1.11699 3.866 0.000309 ***
## deep -1.38378 1.22006 -1.134 0.261916
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.271 on 52 degrees of freedom
## Multiple R-squared: 0.2718, Adjusted R-squared: 0.2298
## F-statistic: 6.47 on 3 and 52 DF, p-value: 0.000835
tidy(male_fitmodel)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 18.2 6.07 3.00 0.00411
## 2 age -0.166 0.0846 -1.96 0.0550
## 3 attitude 4.32 1.12 3.87 0.000309
## 4 deep -1.38 1.22 -1.13 0.262
summary(male_fitmodel)$r.squared
## [1] 0.2718164
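# similar results to the female model.
# age and deep have smaller p-values than in the female model, but attitude has a larger one (0.0003 vs 5.19e-06); attitude is still the only significant predictor at the 5% level.
# R-squared is higher, as the model explains 27% of the variance, but it is still quite low. It could be due to the sample size.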
# I decide to drop the deep and the age as variables and keep only the attitude.
male_fitmodel2 <- lm(points ~ attitude, data = male_data)
summary(male_fitmodel2)
##
## Call:
## lm(formula = points ~ attitude, data = male_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -16.6535 -2.9073 -0.5121 3.6974 10.2106
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 9.061 3.953 2.292 0.02581 *
## attitude 4.189 1.129 3.711 0.00049 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.411 on 54 degrees of freedom
## Multiple R-squared: 0.2032, Adjusted R-squared: 0.1884
## F-statistic: 13.77 on 1 and 54 DF, p-value: 0.0004897
tidy(male_fitmodel2)
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 9.06 3.95 2.29 0.0258
## 2 attitude 4.19 1.13 3.71 0.000490
# R-squared of the attitude-only male model. This used to call male_fitmodel,
# which just repeated the 0.2718 of the three-predictor model.
summary(male_fitmodel2)$r.squared
## [1] 0.2032 (rounded; taken from "Multiple R-squared" in the summary above - re-knit to refresh)
# the p-value for attitude is very low and its standard error is small relative to the estimate, so the model supports a positive relationship: a more positive attitude -> more points.
# R-squared is slightly higher than in the female model (about 20% vs 18%) but it is still quite low
# The model explains only a small part of the variance in points; a large portion remains unexplained. It could be due to the sample size.
# autoplot: Residuals vs Fitted values, Normal QQ-plot and Residuals vs Leverage
# Identify issues with my regression model, such as non-linearity, non-normality, or influential data points
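# autoplot() does not knit here, presumably because ggfortify (which provides autoplot() for lm objects) is only loaded later in this script.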
# autoplot(male_fitmodel)
# autoplot(male_fitmodel2)
# plot with the plot function
plot(male_fitmodel,which = c(1,2,5))
plot(male_fitmodel2,which = c(1,2,5))
#The red line in residuals vs fitted stays quite close to the 0 line which is good
# both models show non normality. it is observed at the beginning of the qq plot
# both models show that there are some points that are high leverage indicated on the residuals vs leverage
test_fit1 <- csv_table_read %>%
lm(points ~ deep, data = .)
library(ggfortify)
summary(test_fit1)
##
## Call:
## lm(formula = points ~ deep, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.6913 -3.6935 0.2862 4.9957 10.3537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.1141 3.0908 7.478 4.31e-12 ***
## deep -0.1080 0.8306 -0.130 0.897
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.913 on 164 degrees of freedom
## Multiple R-squared: 0.000103, Adjusted R-squared: -0.005994
## F-statistic: 0.01689 on 1 and 164 DF, p-value: 0.8967
tidy(test_fit1) # only the intercept is significant; the p-value for deep is 0.897, so deep is not significant
## # A tibble: 2 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 23.1 3.09 7.48 4.31e-12
## 2 deep -0.108 0.831 -0.130 8.97e- 1
summary(test_fit1)$r.squared # too low
## [1] 0.0001029919
test_fit2 <- csv_table_read %>%
lm(points ~ deep * gender, data = .)
library(ggfortify)
summary(test_fit2)
##
## Call:
## lm(formula = points ~ deep * gender, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3247 -3.3338 0.3369 4.6242 10.6787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.36387 3.91828 5.708 5.33e-08 ***
## deep -0.01001 1.06032 -0.009 0.992
## genderM 2.67476 6.41719 0.417 0.677
## deep:genderM -0.40787 1.71487 -0.238 0.812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.922 on 162 degrees of freedom
## Multiple R-squared: 0.00922, Adjusted R-squared: -0.009127
## F-statistic: 0.5025 on 3 and 162 DF, p-value: 0.6811
tidy(test_fit2) # only the intercept is significant; deep, gender and their interaction all have large p-values (0.68 to 0.99)
## # A tibble: 4 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 22.4 3.92 5.71 0.0000000533
## 2 deep -0.0100 1.06 -0.00944 0.992
## 3 genderM 2.67 6.42 0.417 0.677
## 4 deep:genderM -0.408 1.71 -0.238 0.812
summary(test_fit2)$r.squared # too low
## [1] 0.009220341
# Female vs Male participants
csv_table_read %>%
ggplot(aes(x=gender)) +
geom_bar()
# age chart and gender per age
csv_table_read %>%
ggplot(aes(x= age, fill = gender)) +
geom_bar()
# age chart distribution per gender
csv_table_read %>%
ggplot(aes(x= age)) +
facet_grid(~gender) +
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# age box plot distribution per gender
csv_table_read %>%
ggplot(aes(x= gender, y=age)) +
geom_boxplot()
# relationship and distribution between age, points, and gender
csv_table_read %>%
ggplot(aes(y = points, x = age, colour = gender)) +
geom_point() +
labs(title = "Distribution of points per age and gender")
# in this plot we can see the few older participants whose ages pull the mean age above the median.
# Distribution of the points per gender - histogram
csv_table_read %>%
ggplot(aes(x = points)) +
geom_histogram() +
facet_grid(~gender) +
labs(title = "Histogram of points by Gender")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Distribution of the points per gender - boxplot
csv_table_read %>%
ggplot(aes(y = points, x = gender, colour = gender)) +
geom_boxplot() +
labs(title = "Boxplot of points by Gender")
#QQ plot - points per gender
csv_table_read %>%
ggplot(aes(sample = points)) +
geom_qq() +
geom_qq_line(colour = "blue") +
facet_grid(~gender)
# mean points per gender - this is not significant
csv_table_read %>%
t.test(points ~ gender, data = .)
##
## Welch Two Sample t-test
##
## data: points by gender
## t = -1.1832, df = 107.84, p-value = 0.2393
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -3.0896905 0.7799502
## sample estimates:
## mean in group F mean in group M
## 22.32727 23.48214
# attitude vs gender
csv_table_read %>%
ggplot(aes(x=gender, y= attitude)) +
geom_boxplot()
# Type histogram
csv_table_read %>%
ggplot(aes(x = attitude)) +
geom_histogram() +
facet_grid(~ gender) +
labs(title = "Histogram of attitude by Gender")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# QQ plot: attitude per gender
csv_table_read %>%
ggplot(aes(sample = attitude)) +
geom_qq() +
geom_qq_line(colour = "blue") +
facet_grid(~gender)
# mean attitude per gender - This is significant (p < 0.001) and shows a difference between F and M in attitude
csv_table_read %>%
t.test(attitude ~ gender, data = .)
##
## Welch Two Sample t-test
##
## data: attitude by gender
## t = -4.0932, df = 122.66, p-value = 7.657e-05
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -0.6718635 -0.2338508
## sample estimates:
## mean in group F mean in group M
## 2.990000 3.442857
# deep learning approach vs gender
# We could do that for all approach of learning
# Type histogram
csv_table_read %>%
ggplot(aes(x = deep)) +
geom_histogram() +
facet_grid(~ gender) +
labs(title = "Histogram of deep approach by Gender")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
#Type boxplot
csv_table_read %>%
ggplot(aes(y = deep, x = gender, fill = gender)) +
geom_boxplot() +
labs(title = "Boxplot of deep Approach by Gender")
# QQ plot: deep per gender
csv_table_read %>%
ggplot(aes(sample = deep)) +
geom_qq() +
geom_qq_line(colour = "blue") +
facet_grid(~gender)
# mean deep per gender - This is not significant (p = 0.47): there is no evidence of a difference between the genders in the deep approach
csv_table_read %>%
t.test(deep ~ gender, data = .)
##
## Welch Two Sample t-test
##
## data: deep by gender
## t = -0.72082, df = 101.32, p-value = 0.4727
## alternative hypothesis: true difference in means between group F and group M is not equal to 0
## 95 percent confidence interval:
## -0.2546963 0.1189279
## sample estimates:
## mean in group F mean in group M
## 3.656818 3.724702
# age does not seem to have an impact on points
csv_table_read %>%
ggplot(aes(x= age, y=points)) +
geom_point()+
facet_wrap(~ gender) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# age does not seem to have an impact on attitude
csv_table_read %>%
ggplot(aes(x= age, y=attitude)) +
geom_point() +
facet_wrap(~ gender) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# deep learning approach vs age - no correlation
# We could do that for all approach of learning
csv_table_read %>%
ggplot(aes(x= age, y= deep)) +
geom_point() +
facet_wrap(~ gender) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
# deep learning approach vs points - the deep approach does not appear to be correlated with the number of points (the model below has no significant deep term)
csv_table_read %>%
ggplot(aes(x= deep, y=points)) +
geom_point() +
facet_wrap(~ gender) +
geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'
test_fit <- csv_table_read %>%
lm(points ~ deep * gender, data = .)
library(ggfortify)
summary(test_fit)
##
## Call:
## lm(formula = points ~ deep * gender, data = .)
##
## Residuals:
## Min 1Q Median 3Q Max
## -15.3247 -3.3338 0.3369 4.6242 10.6787
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 22.36387 3.91828 5.708 5.33e-08 ***
## deep -0.01001 1.06032 -0.009 0.992
## genderM 2.67476 6.41719 0.417 0.677
## deep:genderM -0.40787 1.71487 -0.238 0.812
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.922 on 162 degrees of freedom
## Multiple R-squared: 0.00922, Adjusted R-squared: -0.009127
## F-statistic: 0.5025 on 3 and 162 DF, p-value: 0.6811
# deep does not have a significant impact on "points" (p = 0.992), and neither does its interaction with gender (p = 0.812).